#!/usr/bin/python
# -*- coding:utf-8 -*-

from html_download import Downloader
from html_parser import Parser
from url_manager import UrlManager
from html_outputer import HtmlOuter
from ext import setup_coding


class SpiderMain(object):
    """Crawl loop that ties together the downloader, parser, URL manager and outputer.

    The collaborators are created once in ``__init__`` and reused for every
    page visited by ``run``.
    """

    def __init__(self):
        """Instantiate the four collaborating components with their defaults."""
        self.downloader = Downloader()
        self.parser = Parser()
        self.url_manager = UrlManager()
        self.outer = HtmlOuter()

    def run(self, base_url):
        """Crawl starting from ``base_url`` until the URL manager is empty.

        For each pending URL: download it, parse the page into
        ``(links, data)``, queue the links, and hand any truthy ``data`` to
        the outputer. Pages whose download yields ``None`` are skipped.

        Args:
            base_url: The first URL handed to the URL manager.

        Returns:
            None. Progress is printed to stdout, one line per URL attempted.
        """
        self.url_manager.add_url(base_url)

        count = 0

        while self.url_manager.has_url():
            # The counter tracks URLs attempted, including failed downloads.
            count += 1
            url = self.url_manager.get_url()
            # Parenthesised single-argument form prints identically on
            # Python 2 and is also valid syntax on Python 3.
            print('%d open url:%s' % (count, url))
            html = self.downloader.do_download(url)
            if html is None:
                # Nothing to parse for this URL; move on to the next one.
                continue
            links, data = self.parser.parser(html, url)
            self.url_manager.add_urls(links)
            if data:
                self.outer.write(data)
            # NOTE: requests are issued back to back with no delay. A real
            # user would not open a dozen pages within one second, so a
            # short pause per iteration (e.g. one second) could be added
            # here to mimic human browsing.


if __name__ == '__main__':
    # Run the project's encoding setup before any crawling starts
    # (presumably adjusts the default string encoding -- confirm in ext).
    setup_coding()

    # Build the crawler and start it from the site's root page.
    SpiderMain().run('http://nanchong.11467.com/')
